# Basic packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rd # generating random numbers
import datetime # manipulating date formats
# Viz
%matplotlib inline
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns # for prettier plots
# Load the superstore orders and derive the calendar features used below.
sales = pd.read_csv("Orders-Table 1.csv")

# BUG FIX: `sales.date = sales.OrderDate.apply(strptime)` assigns a DataFrame
# *attribute*, not a column (pandas warns about this), and `apply` leaves an
# object-dtype series.  Build a real datetime64 column so `.dt` accessors work.
sales['date'] = pd.to_datetime(sales['OrderDate'], format='%m/%d/%y')

# check
print(sales.info())

sales['orderyear'] = sales.date.dt.year
sales['ordermonth'] = sales.date.dt.month
# (year, month) tuple used as a single grouping key for the first plot.
sales['monthyear'] = sales.apply(
    lambda row: (row.orderyear, row.ordermonth),
    axis=1)

newdf = sales[['monthyear', 'Sales', 'Profit']].copy()
newdf.groupby([newdf.monthyear]).sum().plot()

# Same monthly totals, but indexed by real month-start timestamps ('MS')
# so matplotlib draws a proper date axis.
matplotlibdf = sales[['OrderDate', 'Sales', 'Profit']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
linechartData = matplotlibdf.groupby([pd.Grouper(freq='MS', key='OrderDate')]).sum()
linechartData.plot(figsize=(20, 10))

# 3-month rolling mean to smooth the monthly series.
linechartData.rolling(window=3).mean().plot()
import statsmodels.api as sm

# Classical decomposition of monthly sales into trend/seasonal/residual.
# FIX: the keyword was renamed `freq` -> `period` (the old name was removed
# in statsmodels 0.13); 12 = one seasonal cycle per year of monthly data.
res = sm.tsa.seasonal_decompose(linechartData.Sales, period=12, model="multiplicative")
fig = res.plot()
# TIME SERIES
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
# Stationarity tests
def test_stationarity(timeseries):
    """Run the augmented Dickey-Fuller test on `timeseries` and print a
    labelled summary (test statistic, p-value, lags used, critical values)."""
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')
    labels = ['Test Statistic', 'p-value', '#Lags Used',
              'Number of Observations Used']
    summary = pd.Series(adf_result[0:4], index=labels)
    for level, critical in adf_result[4].items():
        summary['Critical Value (%s)' % level] = critical
    print(summary)
# ADF stationarity test on the monthly totals: column 0 is Sales,
# column 1 is Profit (order set when linechartData was built).
print("For Sales")
test_stationarity(linechartData.iloc[:,0].values)
print("\n\nFor Profit")
test_stationarity(linechartData.iloc[:,1].values)
A strongly negative test statistic and a p-value below 0.05 indicate that the series is stationary.
# to remove trend
from pandas import Series as Series
# create a differenced series
def difference(dataset, interval=1):
    """Return the `interval`-lagged differences of `dataset` as a Series.

    Element i of the result is dataset[i + interval] - dataset[i]; the
    output is `interval` elements shorter than the input.
    """
    deltas = [dataset[i] - dataset[i - interval]
              for i in range(interval, len(dataset))]
    return Series(deltas)
# invert differenced forecast
def inverse_difference(last_ob, value):
    """Undo one differencing step: add the previous observation back."""
    return last_ob + value
# Three stacked panels: raw Profit, first-differenced (de-trended), and
# 12-step differenced (de-seasonalized).
# NOTE(review): the series plotted is Profit but the y-axis labels say
# 'Sales' — confirm which was intended.
plt.figure(figsize=(16,16))
plt.subplot(311)
plt.plot(linechartData.Profit)
plt.plot()
plt.subplot(312)
plt.title('After De-trend')
plt.xlabel('Time')
plt.ylabel('Sales')
# First difference removes the trend component.
new_ts=difference(linechartData.Profit)
plt.plot(new_ts)
plt.plot()
plt.subplot(313)
plt.title('After De-seasonalization')
plt.xlabel('Time')
plt.ylabel('Sales')
new_ts=difference(linechartData.Profit,12) # assuming the seasonality is 12 months long
plt.plot(new_ts)
plt.plot()
# Re-run the ADF test on the seasonally differenced series
# (new_ts keeps this value for the ARMA grid search further down).
test_stationarity(new_ts)
def tsplot(y, lags=None, figsize=(10, 8), style='bmh', title=''):
    """Diagnostic plot for a time series: the series itself, its ACF and
    PACF (up to `lags`), a QQ plot and a normal probability plot."""
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    with plt.style.context(style):
        fig = plt.figure(figsize=figsize)
        grid = (3, 2)
        ts_ax = plt.subplot2grid(grid, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(grid, (1, 0))
        pacf_ax = plt.subplot2grid(grid, (1, 1))
        qq_ax = plt.subplot2grid(grid, (2, 0))
        pp_ax = plt.subplot2grid(grid, (2, 1))

        y.plot(ax=ts_ax)
        ts_ax.set_title(title)
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.5)
        sm.qqplot(y, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')
        # Probability plot against a normal fitted to the sample moments.
        scs.probplot(y, sparams=(y.mean(), y.std()), plot=pp_ax)
        plt.tight_layout()
    return
# Simulate an AR(1) process with alpha = 0.6
np.random.seed(1)
n_samples = int(1000)
a = 0.6
# NOTE(review): `x = w = ...` makes x and w the SAME array.  The loop still
# works because w[t] is read before x[t] is overwritten, but x[0] uses
# x[-1] (the last noise value) as its lag — harmless for a demo, but subtle.
x = w = np.random.normal(size=n_samples)
for t in range(n_samples):
    x[t] = a*x[t-1] + w[t]
limit=12
_ = tsplot(x, lags=limit,title="AR(1)process")
# Simulate an AR(2) process with alphas [0.444, 0.333]
n = int(1000)
alphas = np.array([.444, .333])
betas = np.array([0.])
# Python requires us to specify the zero-lag value which is 1
# Also note that the alphas for the AR model must be negated
# We also set the betas for the MA equal to 0 for an AR(p) model
# For more information see the examples at statsmodels.org
ar = np.r_[1, -alphas]
ma = np.r_[1, betas]
ar2 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n)
_ = tsplot(ar2, lags=12,title="AR(2) process")
# Simulate an MA(1) process with beta = 0.8
n = int(1000)
# set the AR(p) alphas equal to 0 so only the MA part remains
alphas = np.array([0.])
betas = np.array([0.8])
# add zero-lag and negate alphas (statsmodels sign convention)
ar = np.r_[1, -alphas]
ma = np.r_[1, betas]
ma1 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n)
limit=12
_ = tsplot(ma1, lags=limit,title="MA(1) process")
# Simulate MA(2) process with betas 0.6, 0.4 (AR part zeroed out)
n = int(1000)
alphas = np.array([0.])
betas = np.array([0.6, 0.4])
# zero-lag coefficient of 1, AR alphas negated per statsmodels convention
ar = np.r_[1, -alphas]
ma = np.r_[1, betas]
ma3 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n)
_ = tsplot(ma3, lags=12,title="MA(2) process")
# Simulate an ARMA(2, 2) model with alphas=[0.8, -0.65] and betas=[0.5, -0.7]
max_lag = 12
n = int(5000) # lots of samples to help estimates
burn = int(n/10) # number of samples to discard before fit
alphas = np.array([0.8, -0.65])
betas = np.array([0.5, -0.7])
# zero-lag coefficient of 1, AR alphas negated per statsmodels convention
ar = np.r_[1, -alphas]
ma = np.r_[1, betas]
arma22 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n, burnin=burn)
_ = tsplot(arma22, lags=max_lag,title="ARMA(2,2) process")
# Grid-search ARMA(p, q) orders 0..4 on the differenced series and keep the
# model with the lowest AIC.
# NOTE(review): `smt.ARMA` was removed from statsmodels >= 0.13; on current
# versions use `statsmodels.tsa.arima.model.ARIMA(order=(p, 0, q))` instead.
best_aic = np.inf
best_order = None
best_mdl = None
rng = range(5)
for i in rng:
    for j in rng:
        try:
            tmp_mdl = smt.ARMA(new_ts.values, order=(i, j)).fit(method='mle', trend='nc')
            tmp_aic = tmp_mdl.aic
            if tmp_aic < best_aic:
                best_aic = tmp_aic
                best_order = (i, j)
                best_mdl = tmp_mdl
        # FIX: was a bare `except:`, which also swallows KeyboardInterrupt and
        # SystemExit; only skip ordinary errors from orders that fail to fit.
        except Exception:
            continue
print('aic: {:6.5f} | order: {}'.format(best_aic, best_order))
# In-sample predictions from the best model.
best_mdl.predict()
# Rebuild the monthly Sales series and forecast it with Prophet.
matplotlibdf = sales[['OrderDate', 'Sales']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
linechartData = matplotlibdf.groupby([pd.Grouper(freq='MS',key='OrderDate')]).sum()
linechartData = linechartData.reset_index()
# NOTE(review): the package was renamed upstream — on current installs this
# is `from prophet import Prophet`; confirm which version is pinned.
from fbprophet import Prophet
# Prophet requires a dataframe in a fixed shape:
# the date column named 'ds' and the value column named 'y'.
linechartData.columns=['ds','y']
model = Prophet( yearly_seasonality=True) #only yearly seasonality, as our data is monthly
model.fit(linechartData) #fit the model with your dataframe
# predict five months into the future; MS (month start) is the frequency
future = model.make_future_dataframe(periods = 5, freq = 'MS')
# now let's make the forecasts
forecast = model.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
model.plot(forecast)
model.plot_components(forecast)
# NOTE(review): the comment said "weekly" but the code groups by day
# ('D') and forecasts 30 days — this is a daily forecast.
matplotlibdf = sales[['OrderDate', 'Sales']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
#matplotlibdf.reset_index().set_index('OrderDate').resample('D', how='sum')
# Daily totals, indexed by order date.
linechartData = matplotlibdf.groupby([pd.Grouper(freq='D',key='OrderDate')]).sum().reset_index().set_index('OrderDate')
#linechartData.reset_index()
#linechartData.set_index('OrderDate')
#linechartData = linechartData.ffill().reset_index()
#linechartData
#linechartData.sort_values(by=['OrderDate'], ascending=[True])
# Reindex onto a continuous daily calendar, filling no-order days with 0.
idx = pd.date_range('01-03-2014', '12-30-2017')
linechartData = linechartData.reindex(idx, fill_value=0)
linechartData = linechartData.reset_index()
from fbprophet import Prophet
# Prophet requires a dataframe in a fixed shape:
# the date column named 'ds' and the value column named 'y'.
linechartData.columns=['ds','y']
model = Prophet( yearly_seasonality=True) #instantiate Prophet with yearly seasonality
model.fit(linechartData) #fit the model with your dataframe
future = model.make_future_dataframe(periods = 30, freq = 'D')
# now let's make the forecasts
forecast = model.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
model.plot(forecast)
def me(df):
    """Mean error (signed bias) of the `yhat` column against `y`."""
    errors = df['yhat'] - df['y']
    return errors.sum() / len(df['yhat'])
def mse(df):
    """Mean squared error of the `yhat` column against `y`."""
    squared = (df['yhat'] - df['y']).pow(2)
    return squared.sum() / len(df)
def rmse(df):
    """Root mean squared error of the `yhat` column against `y`."""
    total_squared = (df['yhat'] - df['y']).pow(2).sum()
    return np.sqrt(total_squared / len(df))
def mae(df):
    """Mean absolute error of the `yhat` column against `y`."""
    absolute = (df['yhat'] - df['y']).abs()
    return absolute.sum() / len(df)
def mpe(df):
    """Mean percentage error (signed, as a fraction of `y`)."""
    relative = (df['yhat'] - df['y']).div(df['y'])
    return relative.sum() * (1 / len(df))
def mape(df):
    """Mean absolute percentage error (as a fraction of `y`)."""
    relative = (df['yhat'] - df['y']).div(df['y']).abs()
    return relative.sum() * (1 / len(df))
def all_metrics(model, df_cv = None):
    """Compute model fit metrics for a Prophet time series.

    Computes ME, MSE, RMSE, MAE, MPE and MAPE either on a supplied
    cross-validation frame or on the model's in-sample fit.

    Parameters
    ----------
    model : fitted Prophet model.  Only used when `df_cv` is None, to build
        an in-sample frame of actuals vs. predictions.
    df_cv : optional pandas DataFrame with `y` and `yhat` columns, e.g.
        produced by cross-validation.

    Returns
    -------
    dict mapping error-type name ('ME', 'MSE', ...) to its value.

    Raises
    ------
    ValueError if the resulting frame has no `yhat` column.
    """
    # FIX: removed the pointless `df = []` initializer — df is always
    # rebound on both branches below.
    if df_cv is not None:
        df = df_cv
    else:
        # Forecast with periods=0 so only the history dates are predicted
        # (in-sample only), then align predictions with the actuals on ds.
        in_sample = model.predict(model.make_future_dataframe(periods=0))
        df = (model
              .history[['ds', 'y']]
              .merge(in_sample[['ds', 'yhat']], how='inner', on='ds')
              )
    if 'yhat' not in df.columns:
        raise ValueError(
            'Please run Cross-Validation first before computing quality metrics.')
    return {
        'ME': me(df),
        'MSE': mse(df),
        'RMSE': rmse(df),
        'MAE': mae(df),
        'MPE': mpe(df),
        'MAPE': mape(df)
    }
# Daily Sales stats plus distinct-customer counts, then average sale per
# customer per day.
matplotlibdf = sales[['OrderDate', 'Customer ID' , 'Sales']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
f = {'Sales':['sum', 'mean', 'count'], 'Customer ID':['nunique']}
linechartData = matplotlibdf.groupby([pd.Grouper(freq='D',key='OrderDate')]).agg(f)
linechartData['AvgSalePerCustomer'] = linechartData['Sales']['sum']/linechartData['Customer ID']['nunique']
print(linechartData)
# Same metric aggregated per city instead of per day.
matplotlibdf = sales[['OrderDate', 'Customer ID' , 'Sales', 'City']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
f = {'Sales':['sum'], 'Customer ID':['nunique']}
linechartData = matplotlibdf.groupby('City').agg(f)
linechartData['AvgSalePerCustomer'] = linechartData['Sales']['sum']/linechartData['Customer ID']['nunique']
print(linechartData.sort_values('AvgSalePerCustomer', ascending=False))
linechartData['AvgSalePerCustomer'].describe()
# ECDF: empirical cumulative distribution function
from statsmodels.distributions.empirical_distribution import ECDF
sns.set(style = "ticks")# to format into seaborn
c = '#386B7F' # basic color for plots (reused by later cells)
plt.figure(figsize = (12, 6))
plt.subplots_adjust(hspace=.5)
# ECDF of total sales per city (linechartData from the previous cell).
plt.subplot(311)
cdf = ECDF(linechartData['Sales']['sum'])
plt.plot(cdf.x, cdf.y, label = "statmodels", color = c);
plt.xlabel('Sales'); plt.ylabel('ECDF');
# ECDF of distinct customers per city
plt.subplot(312)
cdf = ECDF(linechartData['Customer ID']['nunique'])
plt.plot(cdf.x, cdf.y, label = "statmodels", color = c);
plt.xlabel('Customers');
# ECDF of average sale per customer
plt.subplot(313)
cdf = ECDF(linechartData['AvgSalePerCustomer'])
plt.plot(cdf.x, cdf.y, label = "statmodels", color = c);
plt.xlabel('Sale per Customer');
# Per sub-category: descriptive Sales stats, then totals and customer counts.
matplotlibdf = sales[['OrderDate', 'Sub-Category' , 'Sales', 'Customer ID', 'Profit']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
matplotlibdf.groupby('Sub-Category')['Sales'].describe()
f = {'Sales':['sum'], 'Customer ID':['nunique'], 'Profit':['sum']}
matplotlibdf.groupby('Sub-Category').agg(f)
def f(row):
    """Flag a discounted row: 1 when Discount > 0, otherwise 0."""
    return int(float(row['Discount']) > 0)
# Monthly total sales per region, split by discounted vs. non-discounted rows.
# NOTE(review): seaborn renamed `factorplot` to `catplot` in 0.9 — confirm
# the pinned seaborn version.
matplotlibdf = sales[['OrderDate', 'Category' , 'Sales', 'Discount', 'Region', 'Customer ID']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
matplotlibdf['is_discount'] = matplotlibdf.apply(f, axis=1)
matplotlibdf['month'] = matplotlibdf['OrderDate'].dt.month
matplotlibdf['weekday'] = matplotlibdf['OrderDate'].dt.weekday
print(matplotlibdf)
sns.factorplot(data = matplotlibdf, x = 'month', y = "Sales",
col = 'Region', # one column of panels per region
palette = 'plasma',
hue = 'Region',
row = 'is_discount', # one row of panels per discount flag
color = c,
estimator=sum)
# Show point estimates and confidence intervals using scatter plot glyphs.
# A point plot represents an estimate of central tendency for a numeric variable by the
# position of scatter plot points and provides some indication of the uncertainty around that
# estimate using error bars.
sns.factorplot(data = matplotlibdf, x = 'month', y = "Sales",
col = 'weekday', # one column of panels per weekday
palette = 'plasma',
hue = 'Region',
row = 'Region', # one row of panels per region
color = c,
estimator=sum)
def f(row):
    """Return 1 for rows with a positive discount, 0 otherwise."""
    has_discount = float(row['Discount']) > 0
    return 1 if has_discount else 0
# Monthly total sales per customer segment, split by discount flag.
matplotlibdf = sales[['OrderDate', 'Segment' , 'Sales', 'Discount', 'Customer ID']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
matplotlibdf['is_discount'] = matplotlibdf.apply(f, axis=1)
matplotlibdf['month'] = matplotlibdf['OrderDate'].dt.month
matplotlibdf['weekday'] = matplotlibdf['OrderDate'].dt.weekday
sns.factorplot(data = matplotlibdf, x = 'month', y = "Sales",
col = 'Segment', # one column of panels per segment
palette = 'plasma',
hue = 'Segment',
row = 'is_discount', # one row of panels per discount flag
color = c,
estimator=sum)
def f(row):
    """Binary discount indicator for a row (1 = discounted, 0 = not)."""
    if float(row['Discount']) <= 0:
        return 0
    return 1
# Monthly total sales per sub-category, split by discount flag.
matplotlibdf = sales[['OrderDate', 'Sub-Category' , 'Sales', 'Discount', 'Customer ID']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
matplotlibdf['is_discount'] = matplotlibdf.apply(f, axis=1)
matplotlibdf['month'] = matplotlibdf['OrderDate'].dt.month
matplotlibdf['weekday'] = matplotlibdf['OrderDate'].dt.weekday
sns.factorplot(data = matplotlibdf, x = 'month', y = "Sales",
col = 'Sub-Category', # one column of panels per sub-category
palette = 'plasma',
hue = 'Sub-Category',
row = 'is_discount', # one row of panels per discount flag
color = c,
estimator=sum)

def f(row):
    """Map a row's Discount value to a 0/1 discount indicator."""
    return int(float(row['Discount']) > 0)
# Correlation heatmap over the engineered numeric features.
matplotlibdf = sales.copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
matplotlibdf['is_discount'] = matplotlibdf.apply(f, axis=1)
matplotlibdf['month'] = matplotlibdf['OrderDate'].dt.month
matplotlibdf['weekday'] = matplotlibdf['OrderDate'].dt.weekday

# FIX: pandas >= 2.0 raises on non-numeric columns unless numeric_only=True
# (older pandas silently dropped them, so the result is unchanged).
corr_all = matplotlibdf.corr(numeric_only=True)

# Generate a mask for the upper triangle so each pair is drawn once.
# FIX: `np.bool` was removed from NumPy (1.24); use the builtin `bool`.
mask = np.zeros_like(corr_all, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure.
# (renamed `f` -> `fig` so the discount-flag helper `f` is not shadowed)
fig, ax = plt.subplots(figsize=(11, 9))
# Draw the heatmap with the mask and correct aspect ratio.
sns.heatmap(corr_all, mask=mask,
            square=True, linewidths=.5, ax=ax, cmap="BuPu")
# store data (Rossmann store metadata)
store = pd.read_csv("store.csv",
low_memory = False)
store.isnull().sum()
# Impute missing competition distances with the median; every other
# missing value is filled with 0.
store['CompetitionDistance'].fillna(store['CompetitionDistance'].median(), inplace = True)
store.fillna(0, inplace = True)
# importing the Rossmann training data
df = pd.read_csv("train.csv",low_memory = False)
df.info()
# remove closed stores and those with no sales
df = df[(df["Open"] != 0) & (df['Sales'] != 0)]
# sales for the store number 1 (StoreType C)
# NOTE: this rebinding overwrites the superstore `sales` DataFrame used above.
sales = df[df.Store == 1].loc[:, ['Date', 'Sales']]
# reverse to the order: from 2013 to 2015
sales = sales.sort_index(ascending = False)
# to datetime64
sales['Date'] = pd.DatetimeIndex(sales['Date'])
sales.dtypes
# from the prophet documentation every variable should have specific names
sales = sales.rename(columns = {'Date': 'ds',
'Sales': 'y'})
sales.head()
# plot daily sales (`c` is the plot colour defined in the ECDF cell)
ax = sales.set_index('ds').plot(figsize = (12, 4), color = c)
ax.set_ylabel('Daily Number of Sales')
ax.set_xlabel('Date')
plt.show()
# create holidays dataframe
# BUG FIX: `&` binds tighter than `|`, so the original expression
# (a) | (b) & (c) reduced to "type 'a' only" — a row can never equal both
# 'b' and 'c'.  Select all three state-holiday codes with isin().
state_dates = df[df.StateHoliday.isin(['a', 'b', 'c'])].loc[:, 'Date'].values
school_dates = df[df.SchoolHoliday == 1].loc[:, 'Date'].values
state = pd.DataFrame({'holiday': 'state-holiday',
                      'ds': pd.to_datetime(state_dates)})
school = pd.DataFrame({'holiday': 'school-holiday',
                       'ds': pd.to_datetime(school_dates)})
holidays = pd.concat((state, school))
holidays

# Prophet with 95% uncertainty intervals and the holiday regressors above.
model = Prophet(interval_width = 0.95, holidays = holidays)
model.fit(sales)
# dataframe that extends into future 6 weeks (daily frequency)
future_dates = model.make_future_dataframe(periods = 6*7)
print("First week to forecast.")
future_dates.tail(7)
# predictions
forecast = model.predict(future_dates)
# predictions for the last week:
# forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(7)
model.plot(forecast);
model.plot_components(forecast);
all_metrics(model)
The first plot shows that the monthly sales of store number 1 have been linearly decreasing over time, and the second shows the holiday gaps included in the model. The third plot highlights the fact that the weekly sales volume peaks towards Monday of the following week, while the fourth plot shows that the busiest season occurs during the Christmas holidays.
df.head()
# Reload the training data indexed by date and derive calendar features.
train = pd.read_csv("train.csv",parse_dates = True, index_col = 'Date')
train['Year'] = train.index.year
train['Month'] = train.index.month
train['Day'] = train.index.day
# FIX: DatetimeIndex.weekofyear was removed in pandas 2.0;
# isocalendar().week is the supported equivalent (cast back to int).
train['WeekOfYear'] = train.index.isocalendar().week.astype(int)
train = train.reset_index()
# Join the store metadata onto every daily row.
df_new = pd.merge(train, store, how='left', on='Store')
df_new.columns.values
df_new = df_new.drop(['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear','Promo2SinceWeek',
                      'Promo2SinceYear', 'PromoInterval'], axis=1)
df_new.head()
# Keep only open stores with non-zero sales.
df_new = df_new[df_new.Open != 0]
df_new = df_new.drop('Open', axis=1) #it's useless now
df_new = df_new[df_new.Sales != 0]
# Converting state holiday to string as other values are in string (a,b,c)
df_new.loc[df_new.StateHoliday == 0,'StateHoliday'] = df_new.loc[df_new.StateHoliday == 0,'StateHoliday'].astype(str)
# calculate monthly average sales per store
sales = df_new[['Year','Month','Store','Sales']].groupby(['Year','Month','Store']).mean()
sales = sales.rename(columns={'Sales':'AvgSales'})
sales = sales.reset_index()
sales.head()
# Merge the monthly averages back onto the daily rows via a composite key.
# BUG FIX: concatenating year+month+store digits without a separator is
# ambiguous ("2015"+"1"+"11" == "2015"+"11"+"1" == "2015111"), so rows could
# pick up the wrong store/month average.  Join the parts with '_' instead.
df_new['sales_key'] = (df_new['Year'].map(str) + '_' +
                       df_new['Month'].map(str) + '_' +
                       df_new['Store'].map(str))
sales['sales_key'] = (sales['Year'].map(str) + '_' +
                      sales['Month'].map(str) + '_' +
                      sales['Store'].map(str))
# drop extra columns
sales = sales.drop(['Year','Month','Store'], axis=1)
# merge
df_new = pd.merge(df_new, sales, how='left', on='sales_key')

# Average number of customers per (year, month, store), merged the same way.
cust = df_new[['Year','Month','Store','Customers']].groupby(['Year','Month', 'Store']).mean()
cust = cust.rename(columns={'Customers':'AvgCustomer'})
cust = cust.reset_index()
df_new['cust_key'] = (df_new['Year'].map(str) + '_' +
                      df_new['Month'].map(str) + '_' +
                      df_new['Store'].map(str))
cust['cust_key'] = (cust['Year'].map(str) + '_' +
                    cust['Month'].map(str) + '_' +
                    cust['Store'].map(str))
df_new = df_new.drop('Customers', axis=1)# drop the raw per-day customer count
cust = cust.drop(['Year', 'Month', 'Store'], axis=1)
df_new = pd.merge(df_new, cust, how="left", on='cust_key')
# Collapse all state-holiday codes into a binary flag.
df_new['StateHoliday'] = df_new.StateHoliday.map({'0':0, 'a':1 ,'b' : 1,'c': 1})
# Drop the helper keys and identifiers before modelling.
df_new = df_new.drop(['cust_key','sales_key','Store', 'Date'], axis=1)
# Features / target split.
X = df_new.drop('Sales', axis=1)
y = df_new.Sales

# One-hot encode nominal variables for the linear models.
xd = X.copy()
xd = pd.get_dummies(xd)
xd.head()

# Label-encode nominal variables for the tree-based regressors.
xl = X.copy()
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
xl.StateHoliday = label.fit_transform(xl.StateHoliday)
xl.Assortment = label.fit_transform(xl.Assortment)
xl.StoreType = label.fit_transform(xl.StoreType)

# train test split
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
xd_train,xd_test,yd_train,yd_test = train_test_split(xd,y,test_size=0.3, random_state=1)
xl_train,xl_test,yl_train,yl_test = train_test_split(xl,y,test_size=0.3, random_state=1)
xd_train.head()
# Linear-regression baseline on the one-hot encoded features.
from sklearn.linear_model import LinearRegression
lin= LinearRegression()
linreg = lin.fit(xd_train, yd_train)
# define RMSE function (array version; shadows the earlier df-based rmse)
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(x, y):
    """Root mean squared error between actuals x and predictions y."""
    mse_value = mean_squared_error(x, y)
    return sqrt(mse_value)
# define MAPE function (array version; shadows the earlier df-based mape)
def mape(x, y):
    """Mean absolute percentage error of predictions y vs actuals x, in %."""
    relative_errors = np.abs((x - y) / x)
    return np.mean(relative_errors) * 100
# Evaluate the linear model: R^2 score, RMSE and MAPE on train and test.
yd_predicted = linreg.predict(xd_train)
yd_test_predicted = linreg.predict(xd_test)
linear_train_score = linreg.score(xd_train, yd_train)
linear_test_score = linreg.score(xd_test, yd_test)
linear_train_mape = mape(yd_train, yd_predicted)
linear_test_mape = mape(yd_test, yd_test_predicted)
print("Regresion Model Score" , ":" , linear_train_score , "," ,
"Out of Sample Test Score" ,":" , linear_test_score)
print("Training RMSE", ":", rmse(yd_train, yd_predicted),
"Testing RMSE", ":", rmse(yd_test, yd_test_predicted))
print("Training MAPE", ":", linear_train_mape,
"Testing MAPE", ":", linear_test_mape)
# Residual scatter on a 200-row sample of the test predictions.
import matplotlib
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yd_test_predicted, "true":yd_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in Linear")
# Bayesian ridge regression on the one-hot encoded features.
from sklearn.linear_model import BayesianRidge
rdg = BayesianRidge()
rdgreg = rdg.fit(xd_train, yd_train)
yd_predicted = rdgreg.predict(xd_train)
yd_test_predicted = rdgreg.predict(xd_test)
ridge_train_score = rdgreg.score(xd_train, yd_train)
ridge_test_score = rdgreg.score(xd_test, yd_test)
ridge_train_mape = mape(yd_train, yd_predicted)
ridge_test_mape = mape(yd_test, yd_test_predicted)
# validation
print("Regresion Model Score" , ":" , ridge_train_score , "," ,
"Out of Sample Test Score" ,":" , ridge_test_score)
print("Training RMSE", ":", rmse(yd_train, yd_predicted),
"Testing RMSE", ":", rmse(yd_test, yd_test_predicted))
print("Training MAPE", ":", ridge_train_mape,
"Testing MAPE", ":", ridge_test_mape)
# Residual scatter on a 200-row sample of the test predictions.
import matplotlib
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yd_test_predicted, "true":yd_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in Bayesian Ridge")
# LARS Lasso regression on the one-hot encoded features.
# NOTE(review): the `normalize` parameter was deprecated and then removed in
# scikit-learn 1.2 — confirm the pinned sklearn version before running.
from sklearn.linear_model import LassoLars
las = LassoLars(alpha=0.3, fit_intercept=False, normalize=True)
lasreg = las.fit(xd_train, yd_train)
yd_predicted = lasreg.predict(xd_train)
yd_test_predicted = lasreg.predict(xd_test)
lasso_train_score = lasreg.score(xd_train, yd_train)
lasso_test_score = lasreg.score(xd_test, yd_test)
lasso_train_mape = mape(yd_train, yd_predicted)
lasso_test_mape = mape(yd_test, yd_test_predicted)
print("Regresion Model Score" , ":" , lasso_train_score , "," ,
"Out of Sample Test Score" ,":" , lasso_test_score)
print("Training RMSE", ":", rmse(yd_train, yd_predicted),
"Testing RMSE", ":", rmse(yd_test, yd_test_predicted))
print("Training MAPE", ":", lasso_train_mape,
"Testing MAPE", ":", lasso_test_mape)
# Residual scatter on a 200-row sample of the test predictions.
import matplotlib
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yd_test_predicted, "true":yd_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in LARS Lasso")
# Decision-tree regression on the label-encoded features.
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(min_samples_leaf=20)
treereg = tree.fit(xl_train, yl_train)
yl_predicted = treereg.predict(xl_train)
yl_test_predicted = treereg.predict(xl_test)
decision_train_score = treereg.score(xl_train, yl_train)
decision_test_score = treereg.score(xl_test, yl_test)
decision_train_mape = mape(yl_train, yl_predicted)
decision_test_mape = mape(yl_test, yl_test_predicted)
print("Regresion Model Score" , ":" , decision_train_score , "," ,
"Out of Sample Test Score" ,":" , decision_test_score)
print("Training RMSE", ":", rmse(yl_train, yl_predicted),
"Testing RMSE", ":", rmse(yl_test, yl_test_predicted))
print("Training MAPE", ":", decision_train_mape,
"Testing MAPE", ":", decision_test_mape)
# Residual scatter on a 200-row sample of the test predictions.
import matplotlib
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yl_test_predicted, "true":yl_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in Decision Tree")
# Random-forest regression on the label-encoded features.
# FIX: the original imported, built and fit the forest TWICE back-to-back
# (copy/paste duplication) — one fit is enough.
from sklearn.ensemble import RandomForestRegressor
rdf = RandomForestRegressor(n_estimators=30)
rdfreg = rdf.fit(xl_train, yl_train)

yl_predicted = rdfreg.predict(xl_train)
yl_test_predicted = rdfreg.predict(xl_test)
randomforest_train_score = rdfreg.score(xl_train, yl_train)
randomforest_test_score = rdfreg.score(xl_test, yl_test)
randomforest_train_mape = mape(yl_train, yl_predicted)
randomforest_test_mape = mape(yl_test, yl_test_predicted)
print("Regresion Model Score" , ":" , randomforest_train_score , "," ,
"Out of Sample Test Score" ,":" , randomforest_test_score)
print("Training RMSE", ":", rmse(yl_train, yl_predicted),
"Testing RMSE", ":", rmse(yl_test, yl_test_predicted))
print("Training MAPE", ":", randomforest_train_mape,
"Testing MAPE", ":", randomforest_test_mape)

# Residual scatter on a 200-row sample of the test predictions.
import matplotlib
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yl_test_predicted, "true":yl_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in Random Forest Tree")
# K-nearest-neighbours regression on the one-hot encoded features.
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors = 30)
knnreg = knn.fit(xd_train, yd_train)
yd_predicted = knnreg.predict(xd_train)
yd_test_predicted = knnreg.predict(xd_test)
knn_train_score = knnreg.score(xd_train, yd_train)
knn_test_score = knnreg.score(xd_test, yd_test)
knn_train_mape = mape(yd_train, yd_predicted)
knn_test_mape = mape(yd_test, yd_test_predicted)
print("Regresion Model Score" , ":" , knn_train_score , "," ,
"Out of Sample Test Score" ,":" , knn_test_score)
print("Training RMSE", ":", rmse(yd_train, yd_predicted),
"Testing RMSE", ":", rmse(yd_test, yd_test_predicted))
print("Training MAPE", ":", knn_train_mape,
"Testing MAPE", ":", knn_test_mape)
# Residual scatter on a 200-row sample of the test predictions.
import matplotlib
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yd_test_predicted, "true":yd_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in Knn")
# Bar charts comparing MAPE and R^2 across all six models.
train_error=[linear_train_mape,ridge_train_mape,lasso_train_mape,decision_train_mape,randomforest_train_mape, knn_train_mape]
test_error=[linear_test_mape,ridge_test_mape,lasso_test_mape,decision_test_mape,randomforest_test_mape, knn_test_mape]
col={'Train Error':train_error,'Test Error':test_error}
models=['Linear Regression','Bayesian Ridge Regression','LARS Lasso Regression','Decision Tree','Random Forest', 'K-Nearest Neighbours']
df=pd.DataFrame(data=col,index=models)
df.plot(kind='bar')
train_score=[linear_train_score,ridge_train_score,lasso_train_score,decision_train_score,randomforest_train_score, knn_train_score]
test_score=[linear_test_score,ridge_test_score,lasso_test_score,decision_test_score,randomforest_test_score, knn_test_score]
col={'Train Score':train_score,'Test Score':test_score}
models=['Linear Regression','Bayesian Ridge Regression','LARS Lasso Regression','Decision Tree','Random Forest', 'K-Nearest Neighbours']
df=pd.DataFrame(data=col,index=models)
df.plot(kind='bar')
# Horizontal bar chart of the random forest's feature importances,
# sorted least to most important.
features = xl_train.columns
importances = rdfreg.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(8,10))
plt.title('Feature Importances', fontsize=20)
plt.barh(range(len(indices)), importances[indices], color='blue', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
def rmspe(y, yhat):
    """Root mean squared percentage error of predictions yhat vs actuals y."""
    percentage_errors = yhat / y - 1
    return np.sqrt(np.mean(percentage_errors ** 2))
def rmspe_xg(yhat, y):
    """xgboost eval function: invert the log transform on labels and
    predictions, then report ('rmspe', value).

    NOTE(review): this inverts with expm1, i.e. it assumes the targets were
    transformed with log1p; the training cell below uses np.log — confirm
    which transform is intended.
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    return "rmspe", rmspe(actual, predicted)
Tuning Parameters
# base parameters
# NOTE(review): the 'reg:linear' objective was renamed 'reg:squarederror'
# and removed in xgboost 2.0 — confirm the pinned xgboost version.
params = {
'booster': 'gbtree',
'objective': 'reg:linear', # regression task
'subsample': 0.8, # 80% of data to grow trees and prevent overfitting
'colsample_bytree': 0.85, # 85% of features used
'eta': 0.1,
'max_depth': 10,
'seed': 42} # for reproducible results
import xgboost as xgb
from xgboost.sklearn import XGBRegressor # wrapper
# XGB with xgboost library
# Train on log-transformed targets; rmspe_xg inverts the transform when scoring.
yl_train = np.log(yl_train)
yl_test = np.log(yl_test)
dtrain = xgb.DMatrix(xl_train, yl_train)
dtest = xgb.DMatrix(xl_test, yl_test)
watchlist = [(dtrain, 'train'), (dtest, 'test')]
# 300 boosting rounds, stopping early if the test RMSPE stalls for 50 rounds.
xgb_model = xgb.train(params, dtrain, 300, evals = watchlist,
early_stopping_rounds = 50, feval = rmspe_xg, verbose_eval = True)
# XGB with sklearn wrapper (needed below for RandomizedSearchCV)
# the same parameters as for the xgboost model above
params_sk = {'max_depth': 10,
'n_estimators': 300, # the same as num_rounds in xgboost
'objective': 'reg:linear',
'subsample': 0.8,
'colsample_bytree': 0.85,
'learning_rate': 0.1,
'seed': 42}
skrg = XGBRegressor(**params_sk)
skrg.fit(xl_train, yl_train)
import scipy.stats as st
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Random search over continuous distributions (learning rate, gamma,
# L1 regularisation) and a discrete max_depth grid.
params_grid = {
'learning_rate': st.uniform(0.01, 0.3),
'max_depth': list(range(10, 20, 2)),
'gamma': st.uniform(0, 10),
'reg_alpha': st.expon(0, 50)}
search_sk = RandomizedSearchCV(skrg, params_grid, cv = 5) # 5 fold cross validation
search_sk.fit(xl_train, yl_train)
# best parameters
print(search_sk.best_params_); print(search_sk.best_score_)
# Retrain with the parameters found by the randomized search above
# (eta/max_depth/gamma/reg_alpha copied from search_sk.best_params_).
params_new = {
'booster': 'gbtree',
'objective': 'reg:linear',
'subsample': 0.8,
'colsample_bytree': 0.85,
'eta': 0.044338624448041611,
'max_depth': 16,
'gamma': 0.80198330585415034,
'reg_alpha': 23.008226565535971,
'seed': 42}
model_final = xgb.train(params_new, dtrain, 300, evals = watchlist,
early_stopping_rounds = 50, feval = rmspe_xg, verbose_eval = True)
# Render boosted tree #2 of the base model, save it at poster size,
# then plot the feature-importance ranking.
xgb.plot_tree(xgb_model, num_trees=2)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(150, 100)
fig.savefig('tree.png')
xgb.plot_importance(xgb_model)